library(tidyverse)
library(corrplot)
library("PerformanceAnalytics")
library(gtsummary)
library(flextable)
library(caret)
library(glmnet)
library(e1071)
food_data <- read_csv("Food_Supply_kcal_Data.csv")
head(food_data)
## # A tibble: 6 x 32
##   Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
##   <chr>              <dbl>            <dbl>         <dbl>            <dbl>
## 1 Afghan…           0                  4.78         0.850                0
## 2 Albania           0.912             16.1          1.06                 0
## 3 Algeria           0.0896             6.03         0.194                0
## 4 Angola            1.94               4.69         0.264                0
## 5 Antigu…           2.30              15.4          1.54                 0
## 6 Argent…           1.44              15.0          1.06                 0
## # … with 27 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## #   `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## #   Excluding Butter` <dbl>, Miscellaneous <dbl>, Offals <dbl>, Oilcrops <dbl>,
## #   Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar
## #   Crops` <dbl>, `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal
## #   Products` <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>, Obesity <dbl>,
## #   Undernourished <chr>, Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Population <dbl>, `Unit (all except Population)` <chr>
sum(food_data[1,2:24])
## [1] 100.0002
food_data <- food_data  %>%
  drop_na() %>%
  mutate(Confirmed_per_capita = Confirmed/Population * 100000000, Deaths_per_capita = Deaths/Population * 100000000) %>%
  #creating a column that specifies whether each country falls above or below the median
  mutate(Cases_vs_median = as.character(ifelse(Confirmed_per_capita > median(Confirmed_per_capita), "Above", "Below")), 
         Deaths_vs_median = as.character(ifelse(Deaths_per_capita > median(Deaths_per_capita), "Above", "Below")))

food_data <- food_data %>%
  select(-Miscellaneous, -Obesity, -Undernourished, -Recovered, -Active, -Population, -`Unit (all except Population)`, 
         -Confirmed, -Deaths)
head(food_data)
## # A tibble: 6 x 27
##   Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
##   <chr>              <dbl>            <dbl>         <dbl>            <dbl>
## 1 Afghan…           0                  4.78         0.850                0
## 2 Albania           0.912             16.1          1.06                 0
## 3 Algeria           0.0896             6.03         0.194                0
## 4 Angola            1.94               4.69         0.264                0
## 5 Argent…           1.44              15.0          1.06                 0
## 6 Armenia           0.227             12.8          1.77                 0
## # … with 22 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## #   `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## #   Excluding Butter` <dbl>, Offals <dbl>, Oilcrops <dbl>, Pulses <dbl>,
## #   Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar Crops` <dbl>,
## #   `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal Products` <dbl>,
## #   `Vegetable Oils` <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## #   Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
corrplot(is.corr=TRUE, cor(food_data[,2:24], use="pairwise.complete.obs"), method="number")

chart.Correlation(food_data[,2:24], histogram=TRUE)

# summary statistics
food_data_1 <- food_data %>%
  select(-Deaths_vs_median, -Country)
tbl_summary(data = food_data_1, by = "Cases_vs_median",
            statistic = list(all_continuous() ~ "{mean} ({sd})",
                             all_categorical() ~ "{n}({p}%)")) %>%
  add_n() %>%
  add_p(test = all_continuous() ~ "aov") %>%
  as_flex_table() %>%
  bold(part = "header")
food_data_2 <- food_data %>%
  select(-Cases_vs_median, -Country)
tbl_summary(data = food_data_2, by = "Deaths_vs_median",
            statistic = list(all_continuous() ~ "{mean} ({sd})",
                             all_categorical() ~ "{n}({p}%)")) %>%
  add_n() %>%
  add_p(test = all_continuous() ~ "aov") %>%
  as_flex_table() %>%
  bold(part = "header")
# prepare data for cross validation
food_data <- food_data %>%
  # delete country since only each country has only one observation
  select(-`Animal Products`, -Country,-`Vegetal Products`)
food_data %>% head()
## # A tibble: 6 x 24
##   `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl…   Eggs
##              <dbl>         <dbl>            <dbl>            <dbl>  <dbl>
## 1           0              0.850                0             37.1 0.150 
## 2           0.912          1.06                 0             16.2 0.809 
## 3           0.0896         0.194                0             25.0 0.418 
## 4           1.94           0.264                0             18.4 0.0441
## 5           1.44           1.06                 0             16.8 0.864 
## 6           0.227          1.77                 0             19.3 0.731 
## # … with 19 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## #   Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## #   Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## #   Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## #   Treenuts <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>,
## #   Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>, Cases_vs_median <chr>,
## #   Deaths_vs_median <chr>
library(GGally)
# a panel of scatterplots using `GGally`
ggpairs(data = food_data[,c(1:20,24)],
        ggplot2::aes(colour = Deaths_vs_median, alpha =0.5))

food_data <- food_data %>%
  mutate(Cases_vs_median = as.factor(ifelse(Cases_vs_median == "Above",1,0)),
         Deaths_vs_median = as.factor(ifelse(Deaths_vs_median == "Above",1,0)))
# categorical -> predict deaths
library(rBayesianOptimization)
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Deaths_vs_median, k=10)
result_logistic <- list()
result_knn <- list()
result_linear_svm <- list()
result_radial_svm <- list()
result_polynomial_svm <- list()
result_qda <- list()
result_lasso <- list()
result_ridge <- list()
food_data %>% head()
## # A tibble: 6 x 24
##   `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl…   Eggs
##              <dbl>         <dbl>            <dbl>            <dbl>  <dbl>
## 1           0              0.850                0             37.1 0.150 
## 2           0.912          1.06                 0             16.2 0.809 
## 3           0.0896         0.194                0             25.0 0.418 
## 4           1.94           0.264                0             18.4 0.0441
## 5           1.44           1.06                 0             16.8 0.864 
## 6           0.227          1.77                 0             19.3 0.731 
## # … with 19 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## #   Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## #   Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## #   Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## #   Treenuts <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>,
## #   Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>, Cases_vs_median <fct>,
## #   Deaths_vs_median <fct>
food_data$Cases_vs_median <- as_factor(food_data$Cases_vs_median)
food_data$Deaths_vs_median <- as_factor(food_data$Deaths_vs_median)

      for(f in 1:length(cv_folds_10)){
        food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
        food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
        # knn
        set.seed(2342)
        knn_fit <- train(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, 
                         method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
                        trControl = trainControl(method="cv", number = 5))
        # logistic
        logistic_fit <- glm(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, family = binomial())
        # SVM - linear
        set.seed(2342)
        svr_linear_tune <- tune(svm, Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, kernel ="linear", ranges=list(cost=1:3, epsilon=c(0.1,0.5,1)))
        linear_svm_fit <- svr_linear_tune$best.model
        # SVM - radial
        set.seed(2342)
        svr_radial_tune <- tune(svm, Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, kernel ="radial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, epsilon=c(0.1,0.5,1)))
        radial_svm_fit <- svr_radial_tune$best.model
        # SVM - polynomial
        set.seed(2342)
        svr_polynomial_tune <- tune(svm, Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, kernel ="polynomial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, d=seq(2,3)))
        polynomial_svm_fit <- svr_polynomial_tune$best.model
        # QDA
        frmla <-as.formula(paste0(names(food_data_train_cv_10)[24], "~", paste0("`", names(food_data_train_cv_10)[-c(3,21:24)], "`", collapse="+")))
        qda_fit <- train(frmla, data = food_data_train_cv_10, method = "qda")
        # lasso & ridge
        lasso_fit = cv.glmnet(x = data.matrix(food_data[,c(1:20)]), y = data.matrix(food_data[,24]), alpha=1, family="binomial")
        ridge_fit = cv.glmnet(x = data.matrix(food_data[,c(1:20)]), y = data.matrix(food_data[,24]), alpha=0, family="binomial")
        
        # knn
        food_data_test_cv_10$Deaths_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
        result_knn[[f]] <- 1-confusionMatrix(data = food_data_test_cv_10$Deaths_knn_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_knn_predict)
        #logistic
        food_data_test_cv_10$Deaths_logistic_predict_num <- predict(logistic_fit, newdata = food_data_test_cv_10, type="response", na.action=na.exclude)
        food_data_test_cv_10$Deaths_logistic_predict <- as_factor(ifelse(food_data_test_cv_10$Deaths_logistic_predict_num >= 0.5, "1", "0"))
        result_logistic[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_logistic_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_logistic_predict, -Deaths_logistic_predict_num)
        # linear svm
        food_data_test_cv_10$Deaths_linear_svm_predict <- predict(linear_svm_fit, newdata = food_data_test_cv_10)
        result_linear_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_linear_svm_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_linear_svm_predict)
        # radial svm
        food_data_test_cv_10$Deaths_radial_svm_predict <- predict(radial_svm_fit, newdata = food_data_test_cv_10)
        result_radial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_radial_svm_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_radial_svm_predict)
        # polynomial svm
        food_data_test_cv_10$Deaths_polynomial_svm_predict <- predict(polynomial_svm_fit, newdata = food_data_test_cv_10)
        result_polynomial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_polynomial_svm_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_polynomial_svm_predict)
        # qda
        food_data_test_cv_10$Deaths_qda_predict_num <- predict(qda_fit, newdata = food_data_test_cv_10, type = "prob", na.action=na.exclude)$`1`
        food_data_test_cv_10$Deaths_qda_predict = relevel(factor(ifelse(food_data_test_cv_10$Deaths_qda_predict_num > 0.5, "1", "0")), 
                                                          ref = "0")
        result_qda[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_qda_predict, 
                                              reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_qda_predict, -Deaths_qda_predict_num)
        # Lasso
              # probability of being in class "1"
        food_data_test_cv_10$Deaths_lasso_predict_num <- predict(lasso_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
        
        food_data_test_cv_10$Deaths_lasso_predict <- as_factor(ifelse(food_data_test_cv_10$Deaths_lasso_predict_num >= 0.5, "1", "0"))
        result_lasso[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_lasso_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_lasso_predict, -Deaths_lasso_predict_num)
        # Ridge
        food_data_test_cv_10$Deaths_ridge_predict_num <- predict(ridge_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
        food_data_test_cv_10$Deaths_ridge_predict <- as_factor(ifelse(food_data_test_cv_10$Deaths_ridge_predict_num >= 0.5, "1", "0"))
        result_ridge[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_ridge_predict, 
                                                reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Deaths_ridge_predict, -Deaths_ridge_predict_num)
        
      }

data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_logistic = data.frame("CV_error" = mean(unlist(result_logistic)), "CV_error_SE" = sd(unlist(result_logistic)))
data.frame_linear_svm = data.frame("CV_error" = mean(unlist(result_linear_svm)), "CV_error_SE" = sd(unlist(result_linear_svm)))
data.frame_radial_svm = data.frame("CV_error" = mean(unlist(result_radial_svm)), "CV_error_SE" = sd(unlist(result_radial_svm)))
data.frame_polynomial_svm = data.frame("CV_error" = mean(unlist(result_polynomial_svm)), "CV_error_SE" = sd(unlist(result_polynomial_svm)))
data.frame_qda = data.frame("CV_error" = mean(unlist(result_qda)), "CV_error_SE" = sd(unlist(result_qda)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))

summary_table <- 
  rbind("KNN" = data.frame_knn, "Logistic Regression" = data.frame_logistic, "Linear SVM" = data.frame_linear_svm, 
        "Radial SVM" = data.frame_radial_svm, "Polynomial SVM" = data.frame_polynomial_svm, "QDA" = data.frame_qda, 
        "Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
flextable(summary_table %>% rownames_to_column("Algorithm Name"))
library(randomForest)
# random forest
names(food_data) <- make.names(names(food_data))
food_data %>% head()
## # A tibble: 6 x 24
##   Alcoholic.Bever… Animal.fats Aquatic.Product… Cereals...Exclu…   Eggs
##              <dbl>       <dbl>            <dbl>            <dbl>  <dbl>
## 1           0            0.850                0             37.1 0.150 
## 2           0.912        1.06                 0             16.2 0.809 
## 3           0.0896       0.194                0             25.0 0.418 
## 4           1.94         0.264                0             18.4 0.0441
## 5           1.44         1.06                 0             16.8 0.864 
## 6           0.227        1.77                 0             19.3 0.731 
## # … with 19 more variables: Fish..Seafood <dbl>, Fruits...Excluding.Wine <dbl>,
## #   Meat <dbl>, Milk...Excluding.Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## #   Pulses <dbl>, Spices <dbl>, Starchy.Roots <dbl>, Stimulants <dbl>,
## #   Sugar.Crops <dbl>, Sugar...Sweeteners <dbl>, Treenuts <dbl>,
## #   Vegetable.Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## #   Deaths_per_capita <dbl>, Cases_vs_median <fct>, Deaths_vs_median <fct>
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Deaths_vs_median, k=10)
result <- list()
result_rf <- list()
n_list <- c(50, 250, 500)
p <- ncol(food_data) - 4
c(p/2, sqrt(p), p)
## [1] 10.000000  4.472136 20.000000
m_list <- c(10, 4, 20)
reg_rf_tune <- list() 
reg_rf_oob_errors <- list()
round = 0

      for(f in 1:length(cv_folds_10)){
        food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
        food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
        for (i in 1:length(n_list)) {
            for (j in 1:length(m_list)) {
                round <- round + 1
                set.seed(7812)
                reg_rf_tune[[round]] <- randomForest(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, 
                                                     data = food_data_train_cv_10, ntree=n_list[i], mtry=m_list[j])
                head(food_data_train_cv_10)
                reg_rf_oob_errors[[round]] <- data.frame("trees_no"=n_list[i], "preds_no"=m_list[j], 
                                     "oob_errors"=reg_rf_tune[[round]]$err.rate[n_list[i],1])
                }
            }
        reg_rf_oob_errors_df <- do.call("rbind", reg_rf_oob_errors)
        best_errors <- which(reg_rf_oob_errors_df$oob_errors==min(reg_rf_oob_errors_df$oob_errors))
        set.seed(9984)
        reg_rf <- randomForest(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10,
                               ntree=reg_rf_oob_errors_df$trees_no[best_errors[1]], mtry=reg_rf_oob_errors_df$preds_no[best_errors[1]])
        Deaths_Predict_Prob <- predict(reg_rf, newdata = food_data_test_cv_10, type = "prob")
        food_data_test_cv_10$Deaths_rf_Predict <- colnames(Deaths_Predict_Prob)[max.col(Deaths_Predict_Prob, ties.method="first")]
        per_class_accuracy <- rep(NA,length(levels(food_data_test_cv_10$Deaths_vs_median)))
        for (i in 1:length(per_class_accuracy)){
          per_class_accuracy[i] = food_data_test_cv_10 %>% 
          filter(Deaths_vs_median == levels(Deaths_vs_median)[i]) %>%
          summarise(accuracy = sum(Deaths_rf_Predict == 
                                     levels(Deaths_vs_median)[i]) / n()) %>%
          unlist()
          names(per_class_accuracy)[i] <- paste0("accuracy_", 
                                         levels(food_data_test_cv_10$Deaths_vs_median)[i])
        }
        result[[f]] <- per_class_accuracy
        result_rf[[f]] <- 1- confusionMatrix(data = as.factor(food_data_test_cv_10$Deaths_rf_Predict),
                                             reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
      }
df = as.data.frame(matrix(unlist(result), ncol = 2, byrow = TRUE))
CV_error = c("Below" = (1-mean(df[,1], na.rm=TRUE)), 
             "Above" = (1-mean(df[,2], na.rm=TRUE)))

CV_error_SE = c("Below" = sd((1-df[,1]), na.rm=TRUE), 
                "Above" = sd((1-df[,2]), na.rm=TRUE))
data.frame(CV_error, CV_error_SE)
##        CV_error CV_error_SE
## Below 0.2000000   0.2000567
## Above 0.1678571   0.0883683
rf <- data.frame("CV Error" = CV_error, "CV Error SE" = CV_error_SE)
flextable(rf %>% rownames_to_column("Class Name"))
data.frame_rf <- data.frame("CV_error" = mean(unlist(result_rf)), "CV_error_SE" = sd(unlist(result_rf)))
summary_table <- rbind(summary_table, "Random Forest" = data.frame_rf)
flextable(summary_table %>% rownames_to_column("Algorithm Name"))